| 1 |  |  | const crawler_file_tester = { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  |     robot_rules: [], | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  |      * Parse the content of the robots file | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  |      * @param {*} result | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  |      * @throws {Exception} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  |     parse_robots_file: function(result){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  |         var rules = result.split("\n"); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  |         $('#robots-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle"> </span>'); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  |         var agent = '*'; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  |         for(var r in rules){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  |             if( rules[r].length < 1 || rules[r].toLowerCase().indexOf('sitemap:') >= 0 ){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  |                 continue; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |             }else if( rules[r].toLowerCase().indexOf('user-agent:') >= 0 ){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |                 agent = rules[r].replace(/user-agent:/gi, '').replace(/^\s+|\s+$|\s+(?=\s)/g, ''); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 21 |  |  |             }else if( rules[r].toLowerCase().indexOf('disallow:') >= 0 ){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 22 |  |  |                 var rule = | 
            
                                                                                                            
                            
            
                                    
            
            
                | 23 |  |  |                     '^'+rules[r] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 24 |  |  |                     .replace(/disallow:/gi, '') // remove disallow | 
            
                                                                                                            
                            
            
                                    
            
            
                | 25 |  |  |                     .replace(/^\s+|\s+$|\s+(?=\s)/g, '') // remove white space | 
            
                                                                                                            
                            
            
                                    
            
            
                | 26 |  |  |                     .replace('?', '\\?') // escape query string start | 
            
                                                                                                            
                            
            
                                    
            
            
                | 27 |  |  |                     .replace('|', '\\|') // escape pipe | 
            
                                                                                                            
                            
            
                                    
            
            
                | 28 |  |  |                     .replace('/', '\\/') // escape slashes | 
            
                                                                                                            
                            
            
                                    
            
            
                | 29 |  |  |                     .replace(/^\^\^/g, '^') // If it already had a caret remove it | 
            
                                                                                                            
                            
            
                                    
            
            
                | 30 |  |  |                     .replace(/^(\*)/g, '(.*?)'); // Replace star with match anything modifier | 
            
                                                                                                            
                            
            
                                    
            
            
                | 31 |  |  |                 crawler_file_tester.robot_rules.push({ 'rule': rule, 'agent': agent, 'original': rules[r] }); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 32 |  |  |             }else{ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 33 |  |  |                 console.log(rules[r]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 34 |  |  |                 throw "Found a rule which we don't understand. Report it to the developer"; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 35 |  |  |             } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 36 |  |  |         } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 37 |  |  |     }, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 38 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 39 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 40 |  |  |      * Check all tested url and see if they are blocked by any rule in the robots file | 
            
                                                                                                            
                            
            
                                    
            
            
                | 41 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |      * @returns {undefined} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 44 |  |  |     test_blocked_pages: function(){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 45 |  |  |         for(var t in crawler.tested){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 46 |  |  |             var url = crawler.tested[t]; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 47 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 48 |  |  |             if( crawler.linked_from.hasOwnProperty(url) ) { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 49 |  |  |                 for (var r in this.robot_rules) { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 50 |  |  |                     var regex = new RegExp(this.robot_rules[r]['rule'], 'g'); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 51 |  |  |                     if (regex.test('/' + url)) { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 52 |  |  |                         var link    = crawler.painter.create_link(url, url), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 53 |  |  |                             status  = crawler.painter.create_status('error', 'Page has links and is blocked in robots'), | 
            
                                                                                                            
                            
            
                                    
            
            
                | 54 |  |  |                             agent   = ( this.robot_rules[r]['agent'] == '*' ) ? 'ALL BOTS' : this.robot_rules[r]['agent']; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 55 |  |  |                         crawler.painter.add_row( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 56 |  |  |                             'blocked_pages', | 
            
                                                                                                            
                            
            
                                    
            
            
                | 57 |  |  |                             [link, crawler.linked_from[url].join(', '), agent, this.robot_rules[r]['original'], status]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 58 |  |  |                     } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 59 |  |  |                 } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 60 |  |  |             } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |         } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 63 |  |  |         return undefined; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 64 |  |  |     }, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 65 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 66 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 67 |  |  |      * Parse the content of the sitemap file | 
            
                                                                                                            
                            
            
                                    
            
            
                | 68 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 69 |  |  |      * @returns undefined | 
            
                                                                                                            
                            
            
                                    
            
            
                | 70 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 71 |  |  |     parse_sitemap_file: function(result){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 72 |  |  |         crawler.sitemap = []; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 73 |  |  |         var ruleset = $($(result).filter('urlset')[0]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 74 |  |  |         $.each(ruleset.children(), function() { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 75 |  |  |             crawler.sitemap.push( crawler.sanitize($(this).find('loc')[0].innerHTML) ); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 76 |  |  |         }); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 77 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |         $('#sitemap-check').addClass('text-success').append('<span class="glyphicon glyphicon-ok-circle"> </span>'); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 80 |  |  |         return undefined; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 81 |  |  |     }, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 82 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 83 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 84 |  |  |      * Test the urls in the sitemap | 
            
                                                                                                            
                            
            
                                    
            
            
                | 85 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 86 |  |  |      * @returns {undefined} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 87 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 88 |  |  |     test_sitemap: function(){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 89 |  |  |         var sitemap = crawler.sitemap; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 90 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 91 |  |  |         for(var u in sitemap){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 92 |  |  |             var link = crawler.painter.create_link(sitemap[u], sitemap[u]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 93 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 94 |  |  |             if( crawler.failed.indexOf(sitemap[u]) >= 0 ) { | 
            
                                                                                                            
                            
            
                                    
            
            
                | 95 |  |  |                 var status = crawler.painter.create_status('error', 'Page found in sitemap but is broken'); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 96 |  |  |                 crawler.painter.add_row('sitemap', [link, status]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 97 |  |  |                 continue; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 98 |  |  |             } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 99 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 100 |  |  |             if( crawler.tested.indexOf(sitemap[u]) < 0 ){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 101 |  |  |                 var status = crawler.painter.create_status('warning', 'Page found in sitemap but not found by crawler'); | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 102 |  |  |                 crawler.painter.add_row('sitemap', [link, status]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 103 |  |  |                 continue; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 104 |  |  |             } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 105 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 106 |  |  |             if( !crawler.linked_from.hasOwnProperty(sitemap[u]) ){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 107 |  |  |                 var status = crawler.painter.create_status('info', 'Page found in sitemap but has no links on the site'); | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 108 |  |  |                 crawler.painter.add_row('sitemap', [link, status]); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 109 |  |  |             } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |         } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  |         return undefined; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |     }, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |     /** | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |      * Setup an ajax call to fetch url | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |      * @param {string} url | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |      * @param {function} callback | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  |      * @param {function} failed_callback | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |      * | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |      * @returns {undefined} | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |      */ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |     get_file_contents: function(url, callback, failed_callback){ | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |         var t = $.ajax({ | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |             'url': crawler.get_proxy('/seotest/getPage?u='+url+'&agent='+crawler.agent) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |         }).done(callback).fail(failed_callback); | 
            
                                                                                                            
                            
            
                                    
            
            
                | 128 |  |  |         return undefined; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |     } | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  | }; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                // Register the blocked pages and sitemap tests
                crawler.event_handler.on('BEFORE_INIT', function(){
                    // One entry per test: the arguments handed to regiser_test, in order
                    var tests = [
                        ['blocked_pages', 'BLOCKED PAGES', ['URL', 'Linked From', 'Blocked For', 'Blocked By', 'Status']],
                        ['sitemap', 'SITEMAP', ['URL', 'Status']]
                    ];

                    // Each test is registered and then given the 'default' painter type
                    tests.forEach(function(test){
                        crawler.regiser_test(test[0], test[1], test[2], false);
                        crawler.painter.set_type(test[0], 'default');
                    });
                });
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |  | 
            
                                                                                                            
                                                                
            
                                    
            
            
                // Fetch the robots and sitemap files on the AFTER_INIT event
                crawler.event_handler.on('AFTER_INIT', function(){
                    // Builds a failure handler that marks the given element with a danger icon
                    function flag_failure(selector){
                        return function(){
                            var icon = '<span class="glyphicon glyphicon-remove-circle"> </span>';
                            $(selector).addClass('text-danger').append(icon);
                        };
                    }

                    var tester = crawler_file_tester;

                    // Robots file first, then the sitemap; each gets its own parser and failure marker
                    tester.get_file_contents(crawler.robots_url, tester.parse_robots_file, flag_failure('#robots-check'));
                    tester.get_file_contents(crawler.sitemap_url, tester.parse_sitemap_file, flag_failure('#sitemap-check'));
                });
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                // Run the blocked pages and sitemap tests on the ALL_CRAWLS_FINISHED event
                crawler.event_handler.on('ALL_CRAWLS_FINISHED', function(){
                    var tester = crawler_file_tester;

                    tester.test_blocked_pages();
                    tester.test_sitemap();
                });
            
                                                                                                            
                                                                
            
                                    
            
            
                | 159 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 160 |  |  |  | 
            
                        
This check looks for variables that are declared more than once, on multiple lines. There may be several reasons for this.
In the simplest case, the variable name was reused by mistake, which can lead to bugs that are very hard to locate.
If you want to reuse a variable for another purpose, consider declaring it once at or near the top of your function and only assigning to it afterwards, so that it is always declared.